Differential expression analysis of LCM RNA Data

Managing Packages Using Renv

To run this code in my project using the renv environment, run the following lines of code

install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile

Load packages

require("genefilter")
require("DESeq2")
require("apeglm")
require("ashr")
require("ggplot2")
require("vsn")
require("hexbin")
require("pheatmap")
require("RColorBrewer")
require("EnhancedVolcano")
require("rtracklayer")
require("tidyverse")

sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             rtracklayer_1.62.0         
## [11] EnhancedVolcano_1.18.0      ggrepel_0.9.6              
## [13] RColorBrewer_1.1-3          pheatmap_1.0.12            
## [15] hexbin_1.28.5               vsn_3.68.0                 
## [17] ggplot2_3.5.1               ashr_2.2-63                
## [19] apeglm_1.22.1               DESeq2_1.40.2              
## [21] SummarizedExperiment_1.30.2 Biobase_2.60.0             
## [23] MatrixGenerics_1.12.3       matrixStats_1.4.1          
## [25] GenomicRanges_1.54.1        GenomeInfoDb_1.36.4        
## [27] IRanges_2.34.1              S4Vectors_0.38.2           
## [29] BiocGenerics_0.46.0         genefilter_1.82.1          
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3                bitops_1.0-9             rlang_1.1.4             
##  [4] magrittr_2.0.3           compiler_4.3.2           RSQLite_2.3.9           
##  [7] png_0.1-8                vctrs_0.6.5              pkgconfig_2.0.3         
## [10] crayon_1.5.3             fastmap_1.2.0            XVector_0.40.0          
## [13] utf8_1.2.4               Rsamtools_2.18.0         rmarkdown_2.28          
## [16] tzdb_0.4.0               preprocessCore_1.62.1    bit_4.5.0               
## [19] xfun_0.48                zlibbioc_1.46.0          cachem_1.1.0            
## [22] jsonlite_1.8.9           blob_1.2.4               DelayedArray_0.26.7     
## [25] BiocParallel_1.34.2      irlba_2.3.5.1            parallel_4.3.2          
## [28] R6_2.5.1                 stringi_1.8.4            bslib_0.8.0             
## [31] limma_3.56.2             SQUAREM_2021.1           jquerylib_0.1.4         
## [34] numDeriv_2016.8-1.1      Rcpp_1.0.13-1            knitr_1.48              
## [37] timechange_0.3.0         Matrix_1.6-5             splines_4.3.2           
## [40] tidyselect_1.2.1         rstudioapi_0.17.0        abind_1.4-8             
## [43] yaml_2.3.10              codetools_0.2-20         affy_1.78.2             
## [46] lattice_0.22-6           plyr_1.8.9               withr_3.0.1             
## [49] KEGGREST_1.40.1          coda_0.19-4.1            evaluate_1.0.1          
## [52] survival_3.7-0           Biostrings_2.70.3        pillar_1.9.0            
## [55] affyio_1.70.0            BiocManager_1.30.25      renv_1.0.11             
## [58] generics_0.1.3           invgamma_1.1             RCurl_1.98-1.16         
## [61] truncnorm_1.0-9          emdbook_1.3.13           hms_1.1.3               
## [64] munsell_0.5.1            scales_1.3.0             xtable_1.8-4            
## [67] glue_1.8.0               tools_4.3.2              BiocIO_1.12.0           
## [70] GenomicAlignments_1.38.2 annotate_1.78.0          locfit_1.5-9.10         
## [73] mvtnorm_1.3-2            XML_3.99-0.17            grid_4.3.2              
## [76] bbmle_1.0.25.1           bdsmatrix_1.3-7          AnnotationDbi_1.64.1    
## [79] colorspace_2.1-1         GenomeInfoDbData_1.2.10  restfulr_0.0.15         
## [82] cli_3.6.3                fansi_1.0.6              mixsqp_0.3-54           
## [85] S4Arrays_1.0.6           gtable_0.3.5             sass_0.4.9              
## [88] digest_0.6.37            SparseArray_1.2.4        rjson_0.2.23            
## [91] memoise_2.0.1            htmltools_0.5.8.1        lifecycle_1.0.4         
## [94] httr_1.4.7               bit64_4.5.2              MASS_7.3-60.0.1
#set standard output directory for figures
outdir <- "../output_RNA/differential_expression"

save_ggplot <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300,bg=NULL) {
  print(plot)

  png_path <- file.path(outdir, paste0(filename, ".png"))
  pdf_dir <- file.path(outdir, "pdf_figs")
  pdf_path <- file.path(pdf_dir, paste0(filename, ".pdf"))
  
  # Ensure the pdf_figs directory exists
  if (!dir.exists(pdf_dir)) dir.create(pdf_dir, recursive = TRUE)
  
  # Save plots
  ggsave(filename = png_path, plot = plot, width = width, height = height, units = units, dpi = dpi,bg = bg)
  ggsave(filename = pdf_path, plot = plot, width = width, height = height, units = units, dpi = dpi,bg = bg)
}

# Specify colors
ann_colors = list(Tissue = c(OralEpi = "palegreen3" ,Aboral = "mediumpurple1"))

Read and clean count matrix and metadata

Read in raw count data

counts_raw <- read.csv("../output_RNA/stringtie-GeneExt/LCM_RNA_gene_count_matrix.csv", row.names = 1) #load in data

gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9

Read in metadata

meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv") %>%
            dplyr::arrange(Sample) %>%
            mutate(across(c(Tissue, Fragment, Section_Date, LCM_Date), factor)) # Set variables as factors 

meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline

Read in gff for gene coordinates

gff <- import("../references/Pocillopora_acuta_HIv2.genes.gff3")
gff_transcripts <- as.data.frame(gff) %>% filter(type == "transcript") %>%
                                          select(c(seqnames,start,end,width,strand,ID)) %>%
                                          dplyr::rename("chromosome"=seqnames, "query"=ID)

Data sanity checks!

stopifnot(all(meta$Sample %in% colnames(counts_raw))) #are all of the sample names in the metadata column names in the gene count matrix?
stopifnot(all(meta$Sample == colnames(counts_raw))) #are they the same in the same order?

pOverA filtering to reduce dataset

ffun<-filterfun(pOverA(0.49,10))  # Keep genes expressed at 10+ counts in at least 50% of samples
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter

filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter

paste0("Number of genes after filtering: ", sum(counts_filt_poa))
## [1] "Number of genes after filtering: 14464"
write.csv(filtered_counts, file = file.path(outdir, "filtered_counts.csv"))

There are now 14464 genes in the filtered dataset.

Data sanity checks:

all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts))  #are they the same in the same order? Should be TRUE
## [1] TRUE

DESeq2

Create DESeq object and run DESeq2

dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = meta,
                              design= ~ Fragment + Tissue)

dds <- DESeq(dds)

### Extract results for Aboral vs. OralEpi contrast

res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")

Extract results for adjusted p-value < 0.05

res <- resLFC

resOrdered <- res[order(res$pvalue),]# save differentially expressed genes

DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05 & abs(log2FoldChange) > 1)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi

nrow(DE_05)
## [1] 1774
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 558
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 1216
write.csv(as.data.frame(resOrdered), 
          file = file.path(outdir, "DESeq_results.csv"))

write.csv(DE_05, 
          file = file.path(outdir, "DEG_05.csv"))

Visualizing Differential Expression

EnhancedVolcano(res, 
    lab = ifelse(res$padj < 0.05, rownames(res), ""),
    x = "log2FoldChange", 
    y = "pvalue"
)

Plots

plotMA(results(dds, contrast = c("Tissue","Aboral","OralEpi")), ylim=c(-20,20))

plotMA(res, ylim=c(-20,20))

Log2 Fold Change Comparison

resultsNames(dds)
## [1] "Intercept"                "Fragment_B_vs_A"         
## [3] "Fragment_C_vs_A"          "Fragment_D_vs_A"         
## [5] "Fragment_E_vs_A"          "Tissue_Aboral_vs_OralEpi"
# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", type="normal")
resAsh <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", type="ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-20,20)
plotMA(res, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")

plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")

Transforming count data for visualization

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)

meanSdPlot(assay(vsd), main = "vsd")

meanSdPlot(assay(rld))

meanSdPlot(assay(ntd))

#save the vsd transformation
vsd_mat <- assay(vsd)
write.csv(vsd_mat, file = file.path(outdir, "vsd_expression_matrix.csv"))

Will move forward with vst transformation for visualizations

Heatmap of count matrix

heatmap_metadata <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view most significantly differentially expressed genes

select <- order(res$padj)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2, annotation_col=(heatmap_metadata%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Principal component plot of the samples

pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE, ntop = 14464)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "PCA_allgenes")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0(outdir,"/PCA_allgenes_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "PCA")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0(outdir,"/PCA_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)

Clearly, the majority of the variance in the data is explained by tissue type!

Annotation data

Download annotation files from genome website


# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)

CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)

KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())

write.csv(as.data.frame(DE_05_eggnog), file=paste0(outdir,"/DE_05_eggnog_annotation.csv"))

annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)

eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
  separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
  
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "top50_DE")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "up_Aboral")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "up_OralEpi")

Genes of Interest

Single cell marker genes (Levy et al 2021)

MarkerGenes <- read.csv("../references/Pacuta_MarkerGenes_Levy2021.csv") %>% dplyr::rename("query" = 1, "List" = 2, "definition" = 3) %>% filter(List !="Toolkit")

MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Spis_Markers_pairs.csv") %>% select(protein_id_spB,cluster,Standardized_Name_spA ) %>% dplyr::rename("query" = 1, "List" = 2)

MarkerGenes$def_short <- ifelse(nchar(MarkerGenes$definition) > 20, 
                            paste0(substr(MarkerGenes$definition, 1, 17), "..."), 
                            MarkerGenes$definition)

My marker genes (p compressa)

Pcomp_MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/combined_Pcomp_markers.csv") %>% select(protein_id_spA,cluster_guess_std,protein_id_spB) %>% dplyr::rename("Pcomp_gene" = protein_id_spA,"query" = protein_id_spB, "List" = cluster_guess_std) 

Pcomp_MarkerGenes_broc <- Pcomp_MarkerGenes_broc %>% filter(grepl("Pocillopora",query)&!grepl("unidentified",List)) %>% distinct() 

Pcomp_MarkerGenes_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(Pcomp_MarkerGenes_broc) 

Pcomp_MarkerGenes_broc_all_res_Pcomp <- Pcomp_MarkerGenes_broc_all_res %>%
  group_by(Pcomp_gene) %>%
  slice_min(order_by = padj, with_ties = FALSE) %>%
  ungroup()

Pcomp_MarkerGenes_broc_all_res_Pacuta <- Pcomp_MarkerGenes_broc_all_res %>% group_by(query,baseMean,log2FoldChange,lfcSE,pvalue,padj) %>% summarize(List = paste(List, collapse = ";"), .groups="drop")

write.csv(Pcomp_MarkerGenes_broc_all_res_Pcomp, "../output_RNA/differential_expression/Pcomp_snRNA_marker_expr_PacutaLCM.csv")
Pdam_MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/combined_Pdam_markers.csv") %>% select(protein_id_spA,Standardized_Name_spA,protein_id_spB) %>% dplyr::rename("Pdam_gene" = protein_id_spA,"query" = protein_id_spB, "List" = Standardized_Name_spA) 

Pdam_MarkerGenes_broc <- Pdam_MarkerGenes_broc %>% filter(grepl("Pocillopora",query)&!grepl("unidentified",List)) %>% distinct() 

Pdam_MarkerGenes_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% 
  select(query,everything()) %>% left_join(Pdam_MarkerGenes_broc) 

Pdam_MarkerGenes_broc_all_res_Pdam <- Pdam_MarkerGenes_broc_all_res %>%
  group_by(Pdam_gene) %>%
  slice_min(order_by = padj, with_ties = FALSE) %>%
  ungroup()

Pdam_MarkerGenes_broc_all_res_Pacuta <- Pdam_MarkerGenes_broc_all_res %>% group_by(query,baseMean,log2FoldChange,lfcSE,pvalue,padj) %>% summarize(List = paste(List, collapse = ";"), .groups="drop")

write.csv(Pdam_MarkerGenes_broc_all_res_Pdam, "../output_RNA/differential_expression/Pdam_snRNA_marker_expr_PacutaLCM.csv")
Mcap_MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/combined_Mcap_markers.csv") %>% select(protein_id_spA,cluster_guess_std,protein_id_spB) %>% dplyr::rename("Mcap_gene" = protein_id_spA,"query" = protein_id_spB, "List" = cluster_guess_std) 

Mcap_MarkerGenes_broc <- Mcap_MarkerGenes_broc %>% filter(grepl("Pocillopora",query)&!grepl("unidentified",List)) %>% distinct() 

Mcap_MarkerGenes_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% 
  select(query,everything()) %>% left_join(Mcap_MarkerGenes_broc) 

Mcap_MarkerGenes_broc_all_res_Mcap <- Mcap_MarkerGenes_broc_all_res %>%
  group_by(Mcap_gene) %>%
  slice_min(order_by = padj, with_ties = FALSE) %>%
  ungroup()

Mcap_MarkerGenes_broc_all_res_Pacuta <- Mcap_MarkerGenes_broc_all_res %>% group_by(query,baseMean,log2FoldChange,lfcSE,pvalue,padj) %>% summarize(List = paste(List, collapse = ";"), .groups="drop")

write.csv(Mcap_MarkerGenes_broc_all_res_Mcap, "../output_RNA/differential_expression/Mcap_snRNA_marker_expr_PacutaLCM.csv")

Biomineralization toolkit

Biomin <- read.csv("../output_RNA/marker_genes/Pacuta_Biomin_Blast.csv") %>% dplyr::rename("query" = Pocillopora_acuta_best_hit) %>% select(-c(accessionnumber.geneID, Ref))
Biomin_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Biomin_Spis_ortholog.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X,accessionnumber_gene_id, ref))

Biomin <- Biomin %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","))

Biomin$def_short <- ifelse(nchar(Biomin$definition) > 40, 
                            paste0(substr(Biomin$definition, 1, 37), "..."), 
                            Biomin$definition)

Biomin_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin$query),]

Biomin_broc <- Biomin_broc %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","))

Biomin_broc$def_short <- ifelse(nchar(Biomin_broc$definition) > 40, 
                            paste0(substr(Biomin_broc$definition, 1, 37), "..."), 
                            Biomin_broc$definition)

Biomin_broc_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin_broc$query),]

write.csv(Biomin_filtered_counts, "../output_RNA/differential_expression/Biomin_filtered_counts.csv")

Additional gene lists

He_etal_Nvec = He, S., Shao, W., Chen, S. (Cynthia), Wang, T. and Gibson, M. C. (2023). Spatial transcriptomics reveals a cnidarian segment polarity program in Nematostella vectensis. Current Biology 33, 2678-2689.e5.

DuBuc_etal_Nvec = DuBuc, T. Q., Stephenson, T. B., Rock, A. Q. and Martindale, M. Q. (2018). Hox and Wnt pattern the primary body axis of an anthozoan cnidarian before gastrulation. Nat Commun 9, 2007.

  • oral/aboral patterning markers in nematostella
He_etal_Nvec <- read.csv("../output_RNA/marker_genes/He_etal_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))

He_etal_Nvec$def_short <- gsub("Homeobox protein", "Hox", He_etal_Nvec$Description, ignore.case = TRUE)

DuBuc_etal_Nvec <- read.csv("../output_RNA/marker_genes/Wnt_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))
DuBuc_etal_Nvec$def_short <- DuBuc_etal_Nvec$Gene_Name

HeatStressGenes <- read.csv("../output_RNA/marker_genes/HeatStressGenes_Pacuta.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))
DE_05$query <- rownames(DE_05)
resOrdered$query <- rownames(resOrdered)

join_genes_of_interest <- function(df, gene_set) {
  df %>%
    left_join(gene_set, by = "query") %>%
    select(query, everything()) %>%
    drop_na() %>% distinct()
}

DE_05_biomin         <- join_genes_of_interest(DE_05, Biomin)
DE_05_Biomin_broc    <- join_genes_of_interest(DE_05, Biomin_broc)
DE_05_marker         <- join_genes_of_interest(DE_05, MarkerGenes)
DE_05_marker_broc    <- join_genes_of_interest(DE_05, MarkerGenes_broc)
DE_05_He_etal        <- join_genes_of_interest(DE_05, He_etal_Nvec)
DE_05_DuBuc_etal     <- join_genes_of_interest(DE_05, DuBuc_etal_Nvec)

DESeq_biomin         <- join_genes_of_interest(as.data.frame(resOrdered), Biomin)
DESeq_Biomin_broc    <- join_genes_of_interest(as.data.frame(resOrdered), Biomin_broc)
DESeq_marker         <- join_genes_of_interest(as.data.frame(resOrdered), MarkerGenes)
DESeq_marker_broc    <- join_genes_of_interest(as.data.frame(resOrdered), MarkerGenes_broc)
DESeq_He_etal        <- join_genes_of_interest(as.data.frame(resOrdered), He_etal_Nvec)
DESeq_DuBuc_etal     <- join_genes_of_interest(as.data.frame(resOrdered), DuBuc_etal_Nvec)

DE_05_HeatStressGenes <- HeatStressGenes %>% left_join(DE_05, by="query") %>% select(query,everything()) %>% drop_na(baseMean)
DESeq_HeatStressGenes <- HeatStressGenes %>% left_join(as.data.frame(resOrdered), by="query") %>% select(query,everything())  %>% drop_na(baseMean)

write.csv(as.data.frame(DE_05_biomin), file=paste0(outdir,"/DE_05_biomin_annotation.csv"))
write.csv(as.data.frame(DE_05_marker), file=paste0(outdir,"/DE_05_markergene_annotation.csv"))

biomin_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin) 
biomin_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(Biomin) 

Biomin_broc_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin_broc) 
Biomin_broc_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(Biomin_broc) 

markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
markers_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(MarkerGenes) 

broc_markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 
broc_markers_all_res <- as.data.frame(res) %>% mutate(query = rownames(res)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 
#view biomin genes that are differentially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_biomin$query, ]), center = TRUE, scale = TRUE))
DE_biomin <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_biomin$def_short, fontsize_row = 5)
DE_biomin
save_ggplot(DE_biomin, "DE_biomin")

#view biomin genes that are differentially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_Biomin_broc$query, ]), center = TRUE, scale = TRUE))
DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_Biomin_broc$def_short, fontsize_row = 5)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "DE_Biomin_broc")

#view marker genes that are differentially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_marker$query, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker$List, fontsize_row = 4)
DE_marker
save_ggplot(DE_marker, "DE_marker")

DE_05_marker_grouped <- DE_05_marker %>% arrange(List) %>% mutate(List = as.factor(List))

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_grouped$quer, ]), center = TRUE, scale = TRUE))
DE_05_marker_grouped_plot <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_grouped$List, fontsize_row = 5)

DE_05_marker_grouped_plot
save_ggplot(DE_05_marker_grouped_plot, "DE_05_marker_grouped")

#view marker genes that are differentially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_broc$query, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_broc$List, fontsize_row = 2)
DE_marker
save_ggplot(DE_marker, "DE_marker_broc")

DE_05_marker_broc_grouped <- DE_05_marker_broc %>% arrange(List) %>% mutate(List = as.factor(List))

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_broc_grouped$quer, ]), center = TRUE, scale = TRUE))
DE_05_marker_broc_grouped_plot <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_broc_grouped$List, fontsize_row = 3)

DE_05_marker_broc_grouped_plot
save_ggplot(DE_05_marker_broc_grouped_plot, "DE_05_marker_broc_grouped")

Visualizing Differential Expression

Biomin_volcano <- EnhancedVolcano(biomin_all_res, 
    lab = biomin_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 40)

save_ggplot(Biomin_volcano, "Biomin_volcano")

Biomin_broc_volcano <- EnhancedVolcano(Biomin_broc_all_res, 
    lab = Biomin_broc_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 40)

save_ggplot(Biomin_broc_volcano, "Biomin_broc_volcano")

Marker_volcano <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$List,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano, "Marker_volcano")

Marker_volcano_names <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano_names, "Marker_volcano_names")

Marker_volcano <- EnhancedVolcano(broc_markers_all_res, 
    lab = broc_markers_all_res$List,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano, "Marker_volcano_broc")

EnhancedVolcano(res, 
    lab = rownames(res),
    x = 'log2FoldChange',
    y = 'pvalue')

volcano_plain <- EnhancedVolcano(res, 
    lab = NA,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    title="",
    subtitle="",
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(volcano_plain, "volcano_plain",width = 4, height = 6, units = "in", dpi = 300)

Trying to annotate un-annotated DE genes:

# wget protein sequence reference file
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz

#get the names of all the DEGs from the first column of the DEG csv file
tail -n +2 ../output_RNA/differential_expression/DEG_05.csv | cut -d',' -f1 | tr -d '"' > ../output_RNA/differential_expression/DEG_05_names.csv

#grep this file against the protein fasta file, first with wc -l to make sure the number of lines is correct (should be your number of DEGs)
grep -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa | wc -l

#grep each header with the protein sequence after ("-A 1") and save to a new file
grep  -A 1 -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa > ../output_RNA/differential_expression/DEG_05_seqs.txt

nr database BLAST of DE_05 genes only

On andromeda:

Blastp-ing only the DE genes against the entire nr database (will take a while)

cd ../scripts
nano DEG_05_blast.sh
#!/bin/bash
#SBATCH --job-name="DE_blast"
#SBATCH -t 240:00:00
#SBATCH --export=NONE
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --mail-user=zdellaert@uri.edu #your email to send notifications
#SBATCH --mem=500GB
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --account=putnamlab
#SBATCH --nodes=2 --ntasks-per-node=24

module load BLAST+/2.15.0-gompi-2023a

cd ../output_RNA/differential_expression #set working directory
mkdir blast
cd blast

#nr database location andromeda: /data/shared/ncbi-db/.ncbirc 
# points to current location: cat /data/shared/ncbi-db/.ncbirc
# [BLAST]
# BLASTDB=/data/shared/ncbi-db/2024-11-10

blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results.txt -outfmt 0 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 10

blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results_tab.txt -outfmt 6 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1

echo "Blast complete!" $(date)
sbatch DEG_05_blast.sh
cd ../output_RNA/differential_expression/blast

wc -l DEG_05_blast_results_tab.txt #3537 of the 3606 genes were annotated

#get just the NCBI database accession numbers for the blast results
cut -f2 DEG_05_blast_results_tab.txt > DEG_05_blast_accessions.txt

#remove any duplicates
sort -u  DEG_05_blast_accessions.txt > unique_DEG_05_blast_accessions.txt

wc -l unique_DEG_05_blast_accessions.txt #3404 of the 3537 annotations were unique

while read acc; do
  curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$acc&rettype=gp&retmode=text" \
  | grep "DEFINITION" | sed 's/DEFINITION  //g' | awk -v id="$acc" '{print id "\t" $0}'
done < unique_DEG_05_blast_accessions.txt > DEG_05_blast_names.txt

wc -l DEG_05_blast_names.txt #3396 ; unsure why 8 are missing.

join -1 2 -2 1 -t $'\t' <(sort -k2 DEG_05_blast_results_tab.txt) <(sort DEG_05_blast_names.txt) > annotated_DEG_05_blast_results_tab.txt

SwissProt of ALL genes

On unity:

swissprot based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

mkdir ../references/blast_dbs
cd ../references/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_10_02.fasta.gz
gunzip -k uniprot_sprot_r2024_10_02.fasta.gz
rm uniprot_sprot_r2024_10_02.fasta.gz

head uniprot_sprot_r2024_10_02.fasta
echo "Number of Sequences"
grep -c ">" uniprot_sprot_r2024_10_02.fasta
# 572214 sequences
module load blast-plus/2.14.1

makeblastdb \
-in ../references/blast_dbs/uniprot_sprot_r2024_10_02.fasta \
-dbtype prot \
-out ../references/blast_dbs/uniprot_sprot_r2024_10_02
cd ../scripts
nano blastp_SwissProt.sh
#!/bin/bash
#SBATCH -t 18:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --mem=500GB
#SBATCH --export=NONE
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH -D /project/pi_hputnam_uri_edu/zdellaert/LaserCoral #set working directory

module load blast-plus/2.14.1

cd references/
mkdir annotation

fasta="Pocillopora_acuta_HIv2.genes.pep.faa"

blastp \
-query $fasta \
-db blast_dbs/uniprot_sprot_r2024_10_02 \
-out annotation/blastp_SwissProt_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

echo "Blast complete!" $(date)
cd references/annotation/

tr '|' '\t' <  blastp_SwissProt_out.tab >  blastp_SwissProt_out_sep.tab
cd ../references/annotation/

curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo%2Cgo_id%2Cgo_c%2Cgo_f&format=tsv&query=%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_111524.tsv

wc -l SwissProt-Annot-GO_111524.tsv
#572215

All code below based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

bltabl <- read.csv("../references/annotation/blastp_SwissProt_out_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("../references/annotation/SwissProt-Annot-GO_111524.tsv", sep = '\t', header = TRUE)
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs,
    CellularComponent = Gene.Ontology..cellular.component.,
    MolecularFunction = Gene.Ontology..molecular.function.,
  )

head(annot_tab)
##                                        query blast_hit    evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a    Q4JAI4  1.02e-37
## 2  Pocillopora_acuta_HIv2___RNAseq.g18333.t1    O08807 9.62e-116
## 3   Pocillopora_acuta_HIv2___RNAseq.g7985.t1    O74212 3.56e-158
## 4      Pocillopora_acuta_HIv2___TS.g15308.t1    Q09575  1.08e-12
## 5   Pocillopora_acuta_HIv2___RNAseq.g2057.t1    P0C1P0  8.81e-14
## 6   Pocillopora_acuta_HIv2___RNAseq.g4696.t1    Q9W2Q5  8.98e-69
##                                                                                                                                                                                                     ProteinNames
## 1                                                                                                                                              Methionine synthase (EC 2.1.1.-) (Homocysteine methyltransferase)
## 2 Peroxiredoxin-4 (EC 1.11.1.24) (Antioxidant enzyme AOE372) (Peroxiredoxin IV) (Prx-IV) (Thioredoxin peroxidase AO372) (Thioredoxin-dependent peroxide reductase A0372) (Thioredoxin-dependent peroxiredoxin 4)
## 3                                                                                                   Acyl-lipid (8-3)-desaturase (EC 1.14.19.30) (Delta(5) fatty acid desaturase) (Delta-5 fatty acid desaturase)
## 4                                                                                                                                                                                Uncharacterized protein K02A2.6
## 5                                                                              Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y (Phosphatidylinositol-glycan biosynthesis class Y protein) (PIG-Y)
## 6                                                                                                                                                                   Calcium and integrin-binding family member 2
##                                                                                                                                                                                                                                                                                                                                                                                                                   BiologicalProcess
## 1                                                                                                                                                                                                                                                                                                                                                            methionine biosynthetic process [GO:0009086]; methylation [GO:0032259]
## 2 cell redox homeostasis [GO:0045454]; extracellular matrix organization [GO:0030198]; hydrogen peroxide catabolic process [GO:0042744]; male gonad development [GO:0008584]; negative regulation of male germ cell proliferation [GO:2000255]; protein maturation by protein folding [GO:0022417]; reactive oxygen species metabolic process [GO:0072593]; response to oxidative stress [GO:0006979]; spermatogenesis [GO:0007283]
## 3                                                                                                                                                                                                                                                                                                                 long-chain fatty acid biosynthetic process [GO:0042759]; unsaturated fatty acid biosynthetic process [GO:0006636]
## 4                                                                                                                                                                                                                                                                                                                                                                                                      DNA integration [GO:0015074]
## 5                                                                                                                                                                                                                                                                                                                                                                                      GPI anchor biosynthetic process [GO:0006506]
## 6                                                                                                                                                                                                                                                                                                                                                              calcium ion homeostasis [GO:0055074]; phototransduction [GO:0007602]
##                                                                                                                                                                                                          GeneOntologyIDs
## 1                                                                                                                                                                         GO:0003871; GO:0008270; GO:0009086; GO:0032259
## 2 GO:0005615; GO:0005737; GO:0005739; GO:0005783; GO:0005790; GO:0005829; GO:0006979; GO:0007283; GO:0008379; GO:0008584; GO:0022417; GO:0030198; GO:0042744; GO:0042802; GO:0045454; GO:0072593; GO:0140313; GO:2000255
## 3                                                                                                                                                 GO:0006636; GO:0016020; GO:0020037; GO:0042759; GO:0046872; GO:0102866
## 4                                                                                                                                                 GO:0003676; GO:0005737; GO:0008270; GO:0015074; GO:0019899; GO:0042575
## 5                                                                                                                                                                                                 GO:0000506; GO:0006506
## 6                                                                                                                                                             GO:0000287; GO:0005509; GO:0005737; GO:0007602; GO:0055074
##                                                                                                                                                                           CellularComponent
## 1                                                                                                                                                                                          
## 2 cytoplasm [GO:0005737]; cytosol [GO:0005829]; endoplasmic reticulum [GO:0005783]; extracellular space [GO:0005615]; mitochondrion [GO:0005739]; smooth endoplasmic reticulum [GO:0005790]
## 3                                                                                                                                                                     membrane [GO:0016020]
## 4                                                                                                                               cytoplasm [GO:0005737]; DNA polymerase complex [GO:0042575]
## 5                                                                                               glycosylphosphatidylinositol-N-acetylglucosaminyltransferase (GPI-GnT) complex [GO:0000506]
## 6                                                                                                                                                                    cytoplasm [GO:0005737]
##                                                                                                                    MolecularFunction
## 1        5-methyltetrahydropteroyltriglutamate-homocysteine S-methyltransferase activity [GO:0003871]; zinc ion binding [GO:0008270]
## 2 identical protein binding [GO:0042802]; molecular sequestering activity [GO:0140313]; thioredoxin peroxidase activity [GO:0008379]
## 3        di-homo-gamma-linolenate delta5 desaturase activity [GO:0102866]; heme binding [GO:0020037]; metal ion binding [GO:0046872]
## 4                                      enzyme binding [GO:0019899]; nucleic acid binding [GO:0003676]; zinc ion binding [GO:0008270]
## 5                                                                                                                                   
## 6                                                               calcium ion binding [GO:0005509]; magnesium ion binding [GO:0000287]
write.table(annot_tab, 
            file = "../references/annotation/protein-GO.tsv", 
            sep = "\t", 
            row.names = FALSE, 
            quote = FALSE)

DESeq_SwissProt <- as.data.frame(resOrdered) %>% left_join(annot_tab) %>% select(query,everything()) 
DE_05_SwissProt <- DE_05 %>% left_join(annot_tab) %>% select(query,everything()) 

write.csv(as.data.frame(DE_05_SwissProt), file=paste0(outdir,"/DE_05_SwissProt_annotation.csv"))
DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 30, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 27), "..."), 
                            DE_05_SwissProt$ProteinNames)

gene_labels <- DE_05_SwissProt %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"  
##  [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"  
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1" 
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"     
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1" 
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1" 
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1" 
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"  
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1" 
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"     
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2" 
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1" 
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "top50_DE_SwissProt")

#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [2] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [5] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [8] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [10] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [11] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [12] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [13] "Pocillopora_acuta_HIv2___RNAseq.g22261.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [15] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [16] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [18] "Pocillopora_acuta_HIv2___TS.g4983.t1"      
## [19] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [20] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [21] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [24] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [27] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [28] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [30] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [31] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [33] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [34] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [35] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [36] "Pocillopora_acuta_HIv2___RNAseq.g9631.t1"  
## [37] "Pocillopora_acuta_HIv2___RNAseq.g7627.t1"  
## [38] "Pocillopora_acuta_HIv2___RNAseq.g8062.t1"  
## [39] "Pocillopora_acuta_HIv2___RNAseq.g1102.t1"  
## [40] "Pocillopora_acuta_HIv2___RNAseq.g14484.t1" 
## [41] "Pocillopora_acuta_HIv2___RNAseq.g21373.t1" 
## [42] "Pocillopora_acuta_HIv2___TS.g16384.t1"     
## [43] "Pocillopora_acuta_HIv2___RNAseq.g13561.t1" 
## [44] "Pocillopora_acuta_HIv2___RNAseq.g1126.t1"  
## [45] "Pocillopora_acuta_HIv2___RNAseq.g9210.t1"  
## [46] "Pocillopora_acuta_HIv2___RNAseq.g24649.t1" 
## [47] "Pocillopora_acuta_HIv2___RNAseq.g11659.t1" 
## [48] "Pocillopora_acuta_HIv2___TS.g25577.t1a"    
## [49] "Pocillopora_acuta_HIv2___RNAseq.g27681.t1b"
## [50] "Pocillopora_acuta_HIv2___TS.g1968.t2"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "top50_LFC_DE_SwissProt")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "up_Aboral_SwissProt")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "up_OralEpi_SwissProt")

DE_05_SwissProt$short_GO <- ifelse(nchar(DE_05_SwissProt$BiologicalProcess) > 30, 
                            paste0(substr(DE_05_SwissProt$BiologicalProcess, 1, 27), "..."), 
                            DE_05_SwissProt$BiologicalProcess)

gene_labels <- DE_05_SwissProt %>% select(query,short_GO) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "top50_DE_Blast2GO")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "up_Aboral_Blast2GO")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "up_OralEpi_Blast2GO")

Combined annotations manually

Manual <- read.csv(paste0(outdir,"/DE_05_Manual_annotation.csv")) %>% arrange(X) 
Manual <- Manual %>% filter(abs(log2FoldChange) > 1)

Manual$Heatmap_Label <- gsub("Homeobox protein", "Hox", Manual$Heatmap_Label, ignore.case = TRUE)
Manual$Heatmap_Label <- gsub("Protein Wnt", "Wnt", Manual$Heatmap_Label, ignore.case = TRUE)

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"  
##  [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"  
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1" 
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"     
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1" 
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1" 
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1" 
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"  
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1" 
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"     
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2" 
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1" 
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "top50_DE_Manual")

#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [2] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [5] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [8] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [10] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [11] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [12] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [13] "Pocillopora_acuta_HIv2___RNAseq.g22261.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [15] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [16] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [18] "Pocillopora_acuta_HIv2___TS.g4983.t1"      
## [19] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [20] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [21] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [24] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [27] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [28] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [30] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [31] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [33] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [34] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [35] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [36] "Pocillopora_acuta_HIv2___RNAseq.g9631.t1"  
## [37] "Pocillopora_acuta_HIv2___RNAseq.g7627.t1"  
## [38] "Pocillopora_acuta_HIv2___RNAseq.g8062.t1"  
## [39] "Pocillopora_acuta_HIv2___RNAseq.g1102.t1"  
## [40] "Pocillopora_acuta_HIv2___RNAseq.g14484.t1" 
## [41] "Pocillopora_acuta_HIv2___RNAseq.g21373.t1" 
## [42] "Pocillopora_acuta_HIv2___TS.g16384.t1"     
## [43] "Pocillopora_acuta_HIv2___RNAseq.g13561.t1" 
## [44] "Pocillopora_acuta_HIv2___RNAseq.g1126.t1"  
## [45] "Pocillopora_acuta_HIv2___RNAseq.g9210.t1"  
## [46] "Pocillopora_acuta_HIv2___RNAseq.g24649.t1" 
## [47] "Pocillopora_acuta_HIv2___RNAseq.g11659.t1" 
## [48] "Pocillopora_acuta_HIv2___TS.g25577.t1a"    
## [49] "Pocillopora_acuta_HIv2___RNAseq.g27681.t1b"
## [50] "Pocillopora_acuta_HIv2___TS.g1968.t2"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "top50_LFC_DE_Manual")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_Aboral
save_ggplot(up_Aboral, "up_Aboral_Manual")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_OralEpi
save_ggplot(up_OralEpi, "up_OralEpi_Manual")

Expression of certain genes of interest: blast against P. acuta genome

yin yang Transctiption factor

cd ../references

#download the genome protein fasta if you have not already
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

#unzip file
gunzip Pocillopora_acuta_HIv2.genes.pep.faa.gz

In unity

salloc -p cpu -c 8 --mem 32G

module load uri/main
module load BLAST+/2.15.0-gompi-2023a

#make blast_dbs directory if you haven't done so above
mkdir blast_dbs
cd blast_dbs

makeblastdb -in ../Pocillopora_acuta_HIv2.genes.pep.faa -out Pacuta_prot -dbtype prot

cd ../../output_RNA/differential_expression/

#make blast output directory if you haven't done so above
mkdir blast
cd blast

nano YinYang.txt

add accession numbers of interest:

  • nematostella vectensis transcriptional repressor protein YY1 isoforms
    • XP_048585772.1
    • XP_048585773.1
    • XP_048585774.1
  • human transcription factor YY2
    • NP_996806.2
  • human transcription factor ZFP41 (REX-1)
    • NP_777560.2
XP_048585772.1
XP_048585773.1
XP_048585774.1
NP_996806.2
NP_777560.2
# Read the input file line by line and fetch FASTA sequences
while read -r accession; do
  if [[ -n "$accession" ]]; then
    echo "Fetching $accession..."
    curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=${accession}&rettype=fasta&retmode=text" >> "YinYang.fasta"
    echo >> "YinYang.fasta"  # Add a newline between sequences
    sleep 1  # Avoid hitting rate limits
  fi
done < "YinYang.txt"

# run blast with human-readable output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results.txt -outfmt 0

#looks like there are a lot of matches for each gene! I am going to do a tab search with a very low e-value cutoff:

# run blast with tabular output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results_tab.txt -outfmt 6 -evalue 1e-25

Great! Interestingly, the same gene, Pocillopora_acuta_HIv2_RNAseq.g25242.t1 is the top match for all 5 proteins I searched. Pocillopora_acuta_HIv2_TS.g24434.t1 is the second best match for all 5. That is interesting! Pocillopora_acuta_HIv2_RNAseq.g7583.t1 is also worth looking at. And Pocillopora_acuta_HIv2_TS.g21338.t1 matched all three YY1 isoforms.

YinYangs <- c("Pocillopora_acuta_HIv2___RNAseq.g25242.t1",
              "Pocillopora_acuta_HIv2___TS.g24434.t1",
              "Pocillopora_acuta_HIv2___RNAseq.g7583.t1",
              "Pocillopora_acuta_HIv2___TS.g21338.t1")

for (i in 1:length(YinYangs)){
  plotCounts(dds, gene=YinYangs[i], intgroup=c("Tissue"),)
}

as.data.frame(resOrdered)[YinYangs,]
##                                             baseMean log2FoldChange     lfcSE
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 2388.43282    -0.65267532 0.8125595
## Pocillopora_acuta_HIv2___TS.g24434.t1       86.14232    -0.16148954 1.0080043
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   429.08727     0.06121494 0.9629085
## Pocillopora_acuta_HIv2___TS.g21338.t1      277.12637    -0.03463576 1.0095974
##                                              pvalue      padj
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 0.1695296 0.3778825
## Pocillopora_acuta_HIv2___TS.g24434.t1     0.2933826 0.5402220
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1  0.8212358 0.9266210
## Pocillopora_acuta_HIv2___TS.g21338.t1     0.2453329 0.4838418
##                                                                               query
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 Pocillopora_acuta_HIv2___RNAseq.g25242.t1
## Pocillopora_acuta_HIv2___TS.g24434.t1         Pocillopora_acuta_HIv2___TS.g24434.t1
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   Pocillopora_acuta_HIv2___RNAseq.g7583.t1
## Pocillopora_acuta_HIv2___TS.g21338.t1         Pocillopora_acuta_HIv2___TS.g21338.t1

Okay! None of these genes are differentially expressed between the tissues. That is interesting and good to know. Pocillopora_acuta_HIv2___RNAseq.g25242.t1 has the highest basal expression of all the potential isoforms of this transcription factor.

TRP Channels

DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 80, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 77), "..."), 
                            DE_05_SwissProt$ProteinNames)

TRP <- Manual %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE)) 
select <- TRP$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 12)
top50_DE
save_ggplot(top50_DE, "TRP_DE_SwissProt", width = 6.43, height = 4.25, units = "in", dpi = 300)

annot_tab$short_name <- ifelse(nchar(annot_tab$ProteinNames) > 80, 
                            paste0(substr(annot_tab$ProteinNames, 1, 77), "..."), 
                            annot_tab$ProteinNames)

swissprot_labels <- annot_tab  %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

TRP <- annot_tab %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE))
select1 <- TRP$query
select <- match(select1,rownames(vsd))
select <- select[!is.na(select)]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = swissprot_labels[match(select1,swissprot_labels$query),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "TRP_SwissProt")

biomin de heatmap nice

DE_05_biomin_filtered <- DE_05_biomin %>% left_join(Manual,by="query" ) #%>% filter(!(Heatmap_Label %in% c("Myosin-9", "Actin, cytoplasmic")))
  
select <- DE_05_biomin_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_biomin <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_biomin
save_ggplot(DE_biomin, "DE_biomin")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_biomin <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8
)
save_ggplot(DE_biomin, "clusters_clean/DE_biomin")

#############################

DE_05_Biomin_broc_filtered <- DE_05_Biomin_broc %>% left_join(Manual,by="query" ) %>% filter(!(Heatmap_Label %in% c("Myosin-9", "Actin, cytoplasmic")))
  
select <- DE_05_Biomin_broc_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "DE_Biomin_broc")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_Biomin_broc <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8
)
save_ggplot(DE_Biomin_broc, "clusters_clean/DE_Biomin_broc")

vst_counts <- assay(vsd)["Pocillopora_acuta_HIv2___RNAseq.g15280.t1", ]
df <- as.data.frame(colData(dds))
df$expression <- vst_counts

library(ggplot2)
ggplot(df, aes(x=Tissue, y=expression, group=Fragment, color=Fragment)) +
  geom_point(size=3) +
  geom_line() +
  theme_minimal() +
  labs(title="VST Expression by Tissue", y="VST normalized expression")

Marker DE nice

gene_labels <- DE_05_SwissProt %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view marker genes that are differentially expressed
select <- DE_05_marker$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = DE_05_marker$List, fontsize_row = 7)
DE_marker
save_ggplot(DE_marker, "DE_marker_2")

#view marker genes that are differentially expressed
select <- DE_05_marker$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = paste0(DE_05_marker$List,": ", gene_labels[match(select,(gene_labels$query)),2]), fontsize_row = 7)
DE_marker
save_ggplot(DE_marker, "DE_marker_3")

#view marker genes that are differentially expressed
select <- DE_05_marker_broc$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = DE_05_marker_broc$List, fontsize_row = 7)
DE_marker_broc
save_ggplot(DE_marker_broc, "DE_marker_broc_2",height=9)

#view marker genes that are differentially expressed
select <- DE_05_marker_broc$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_marker_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), 
                      cluster_rows=TRUE, show_rownames=TRUE,
                      cluster_cols=TRUE, cutree_cols = 2, cutree_rows = 5,
                      annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
                      labels_row = paste0(DE_05_marker_broc$List,": ", gene_labels[match(select,(gene_labels$query)),2]), fontsize_row = 6)
DE_marker_broc
save_ggplot(DE_marker_broc, "DE_marker_broc_3",height=9)

DE_05_marker_broc <- DE_05_marker_broc %>% left_join(Manual,by=c("query")) %>% select(!ends_with(".y"))

bacteria genes

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

mucin <- Manual %>% filter(grepl("mucin", ProteinNames, ignore.case = TRUE)|
                             grepl("lectin", ProteinNames, ignore.case = TRUE)|
                             grepl("toll-", ProteinNames, ignore.case = TRUE)|
                             grepl("ZP", ProteinNames, ignore.case = TRUE)|
                             
                             grepl("nitric", ProteinNames, ignore.case = TRUE)|
                             grepl("sulfotransferase", ProteinNames, ignore.case = TRUE)|
                             grepl("Complement ", ProteinNames, ignore.case = TRUE)|
                             grepl("toll-like receptor", BiologicalProcess, ignore.case = TRUE)|
                             grepl("NF-kappaB", BiologicalProcess, ignore.case = TRUE)|
                             grepl("complement activation", BiologicalProcess, ignore.case = TRUE)|
                             grepl("3'-phosphoadenosine 5'-phosphosulfate", BiologicalProcess, ignore.case = TRUE)) %>% 
  filter(Heatmap_Label !="Cnidocyte marker protein (Collectin-11)")

select <- unique(mucin$query)
 
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "bacteria_DE_SwissProt",height=12)

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
mucins <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(43) 
)

# Save the heatmap
save_ggplot(mucins, "clusters_clean/bacteria_DE_SwissProt",height=12)

Solute Carrier (SLC) Family genes

solute_all <- DESeq_SwissProt %>% filter(grepl("Solute carrier", ProteinNames, ignore.case = TRUE))
solute_all <- solute_all %>%
  mutate(SLC_family = str_extract(ProteinNames, "Solute carrier family \\d+")) %>%
  mutate(SLC_family = str_replace(SLC_family,"Solute carrier family ", "SLC")) 

nrow(solute_all)
## [1] 211
solute_de <- Manual %>% filter(grepl("Solute carrier", ProteinNames, ignore.case = TRUE))
solute_de <- solute_de %>%
  mutate(SLC_family = str_extract(ProteinNames, "Solute carrier family \\d+")) %>%
  mutate(SLC_family = str_replace(SLC_family,"Solute carrier family ", "SLC")) 


solute_de_upAboral <- solute_de %>% filter(log2FoldChange > 1)
nrow(solute_de_upAboral)
## [1] 9
solute_de_upOral <- solute_de %>% filter(log2FoldChange < -1)
nrow(solute_de_upOral)
## [1] 21
length(unique(solute_all$SLC_family))
## [1] 45
length(unique(solute_de_upAboral$SLC_family))
## [1] 8
length(unique(solute_de_upOral$SLC_family))
## [1] 10
table(solute_de$SLC_family)
## 
##  SLC1 SLC12 SLC15 SLC16 SLC22 SLC23 SLC24 SLC25 SLC30 SLC32 SLC35 SLC46  SLC6 
##     2     3     1     4     2     2     1     2     1     1     1     1     5 
##  SLC7 
##     4
#Basing my annotations on: https://re-solute.eu/knowledgebase/slcfamily and https://slc.bioparadigms.org/

slc_map <- data.frame(
  SLC_family = c(
    "SLC1",
    "SLC12",
    "SLC15",
    "SLC16",
    "SLC22",
    "SLC23",
    "SLC24",
    "SLC25",
    "SLC30",
    "SLC32",
    "SLC35",
    "SLC46",
    "SLC6",
    "SLC7"
  ),
  Function = c(
    "High-affinity glutamate and neutral amino acid transporter family",
    "Electroneutral cation-coupled Cl cotransporter family",
    "Proton oligopeptide cotransporter family",
    "Monocarboxylate transporter family",
    "Organic cation/anion/zwitterion transporter family",
    "Sodium-dependent ascorbic acid transporter family",
    "Na+/(Ca2+-K+) exchanger family",
    "Mitochondrial carrier family",
    "Zinc transporter family",
    "Vesicular inhibitory amino acid transporter family",
    "Nucleoside-sugar transporter family",
    "Folate transporter family",
    "Sodium- and chloride-dependent neurotransmitter transporter family",
    "Cationic amino acid transporter/glycoprotein-associated family"
  ),
  Role = c(
    "Neurotransmission, Glu metabolism, development, aa homeostasis",
    "cell volume, chloride concentration, blood pressure regulation",
    "Immunity, homeostasis of neuropeptides, peptide absoption",
    "glucose homeostasis / metabolism, lipid metabolism",
    "Drug uptake, endocytosis",
    "Absortion and distribution of ascorbic acid",
        "Skin/hair/eye pigmentation, vision, olfaction",
    "Apoptosis, Glucose/Lipid/Amino acid/Urea, metab., ATP synthesis",
    "Zinc homestasis, development, body weight, glucose metabolism",
    "Development, neurotransmission",
    "Virus infection, development, autophagy, cell proliferation",
    "Folate metabolism, Immunity, Drug uptake",
    "Cell differentiation, neurotransmission",
    "Nitric oxide synthesis, redox balance, aa homeostasis"
  ),
  Substrate_Specific = c(
    "Glutamate as Neurotransmitter, anionic, neutral amino acids",
    "Chloride, sodium, potassium",
    "Dipeptides, oligopeptides",
    "Ketone bodies, Lactate, Pyruvate, creatine, aromatic amino acids, T2,T3,T4",
    "Drugs, neurotransmitters, purine and pyrimidine nucleosides, niacin, steroid, carnitine, carnitine derivatives, organic anions",
    "Ascorbic acid",
    "Calcium, Potassium, Sodium",
    "Amino acids, Di- tricarboxylic ions, Co-A, Carnitine/acyl-carnitines, H+, metals, nucleotides",
    "Zinc, Managnese, Lead",
    "GABA, Glycine",
    "Purines, pyrimidines, Thiamine, nuclotide-sugars",
    "Folic acid, Heme, Methotrexate",
    "Neurotransmitters, basic, neutral amino acids",
    "Amino acids"
  ),
  Substrate_Broad = c(
    "amino acids",
    "inorganic ions",
    "peptides",
    "energy metabolites",
    "organic ions",
    "ascorbic acid",
    "inorganic ions",
    "mitochondrial",
    "transition element cations",
    "amino acids; neurotransmitter",
    "nucleosides, nucleotides and nucleotide-sugars",
    "organic ions",
    "neurotransmitter",
    "amino acids"
  )
)
solute_de_annot <- solute_de %>%
  left_join(slc_map, by = "SLC_family") %>%
  mutate(Direction = ifelse(log2FoldChange > 1, "upAboral",
                            ifelse(log2FoldChange < -1, "upOral", "noChange")))

# Summarize counts by family and direction
plot_data <- solute_de_annot %>%
  filter(Direction != "noChange") %>%
  group_by(SLC_family, Substrate_Broad, Direction) %>%
  summarise(n = n(), .groups = "drop")

ggplot(plot_data, aes(x = SLC_family, y = n, fill = Direction)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Substrate_Broad, scales = "free_x") +
  scale_fill_manual(values = c("upOral" = "darkgreen", "upAboral" = "purple")) +
  theme_minimal() +
  labs(title = "Differentially Expressed SLC Genes by Family",
       x = "SLC Family", y = "Number of DE Genes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select <- solute_de$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "SLC_DE_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
SLC <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white", 
  gaps_row = c(21) 
)

# Save the heatmap
save_ggplot(SLC, "clusters_clean/SLC_DE_SwissProt")

temp and light sensing

sensors <- Manual %>% filter(grepl("cellular response to light stimulus", BiologicalProcess, ignore.case = TRUE)|
                               grepl("detection of mechanical stimulus", BiologicalProcess, ignore.case = TRUE)|
                               grepl("thermoception [GO:0050955]", BiologicalProcess, ignore.case = TRUE)) %>% 
                               filter(!grepl("Solute carrier", ProteinNames, ignore.case = TRUE))

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

select <- sensors$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "sensors_DE_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  mutate(cluster = factor(cluster, levels = c(1, 3, 2))) %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
transporters_heat <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = c(11)  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(transporters_heat, "clusters_clean/sensors_DE_SwissProt")

stereocilium

stereo <- DE_05_SwissProt %>% filter(grepl("stereocilium", CellularComponent, ignore.case = TRUE))

stereo
##                                        query   baseMean log2FoldChange
## 1   Pocillopora_acuta_HIv2___RNAseq.g9466.t1 3868.10738      -8.737792
## 2       Pocillopora_acuta_HIv2___TS.g4109.t3  319.81897      -9.837348
## 3   Pocillopora_acuta_HIv2___RNAseq.g7241.t1  474.33368     -10.226756
## 4   Pocillopora_acuta_HIv2___RNAseq.g7243.t1  147.91088     -11.705170
## 5    Pocillopora_acuta_HIv2___RNAseq.10010_t  685.38221      -1.476638
## 6  Pocillopora_acuta_HIv2___RNAseq.g20846.t1  519.71004      -8.729080
## 7  Pocillopora_acuta_HIv2___RNAseq.g25708.t1  635.87560      -1.205273
## 8      Pocillopora_acuta_HIv2___TS.g10198.t2  278.96807      -1.304146
## 9      Pocillopora_acuta_HIv2___TS.g14552.t1 1206.96542      -4.246638
## 10  Pocillopora_acuta_HIv2___RNAseq.g7248.t1   34.88719      -8.607143
## 11 Pocillopora_acuta_HIv2___RNAseq.g20688.t1 4154.69322      -1.937450
## 12  Pocillopora_acuta_HIv2___RNAseq.g6836.t1   25.21477      -7.731032
##        lfcSE       pvalue         padj blast_hit    evalue
## 1  5.4635007 7.820973e-12 7.251445e-10    Q6RI86 1.07e-142
## 2  2.1908720 6.793147e-11 4.163393e-09    Q0ZLH3  8.77e-14
## 3  2.5502784 1.793257e-10 9.431877e-09    Q8WU66  1.99e-36
## 4  3.7496767 1.393834e-08 3.862149e-07    Q8WU66  1.73e-32
## 5  3.5944624 1.005366e-07 2.167156e-06    O75762 6.47e-137
## 6  3.1838656 1.435112e-07 2.965351e-06    Q80ZA4  0.00e+00
## 7  2.1727987 2.401517e-06 3.356091e-05    Q8IVV2  0.00e+00
## 8  2.3870971 2.757473e-06 3.769763e-05    Q9UMZ3  1.08e-40
## 9  1.3703258 9.510383e-06 1.092599e-04    P58365  0.00e+00
## 10 3.9205902 1.308367e-04 1.042827e-03    Q8WU66  9.09e-34
## 11 0.6451179 3.498918e-04 2.411946e-03    Q8WU66  2.96e-18
## 12 4.0831673 6.055546e-04 3.846615e-03    Q8WU66  5.46e-17
##                                                                                                                                                                             ProteinNames
## 1                                                 Transient receptor potential cation channel subfamily A member 1 (Ankyrin-like with transmembrane domains protein 1) (Wasabi receptor)
## 2                                                                                                                                Pejvakin (Autosomal recessive deafness type 59 protein)
## 3                                                                                                       Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 4                                                                                                       Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 5  Transient receptor potential cation channel subfamily A member 1 (Ankyrin-like with transmembrane domains protein 1) (Transformation-sensitive protein p120) (p120) (Wasabi receptor)
## 6                                                                            Fibrocystin-L (Polycystic kidney and hepatic disease 1-like protein 1) (PKHD1-like protein 1) (Protein D86)
## 7                                                                                                                                      Lipoxygenase homology domain-containing protein 1
## 8                                                    Phosphatidylinositol phosphatase PTPRQ (EC 3.1.3.-) (Receptor-type tyrosine-protein phosphatase Q) (PTP-RQ) (R-PTP-Q) (EC 3.1.3.48)
## 9                                                                                                                                                              Cadherin-23 (Otocadherin)
## 10                                                                                                      Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 11                                                                                                      Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 12                                                                                                      Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      BiologicalProcess
## 1  calcium ion transmembrane import into cytosol [GO:0097553]; calcium ion transmembrane transport [GO:0070588]; calcium ion transport [GO:0006816]; cell surface receptor signaling pathway [GO:0007166]; cellular response to caffeine [GO:0071313]; cellular response to carbon dioxide [GO:0071244]; cellular response to cold [GO:0070417]; cellular response to heat [GO:0034605]; cellular response to hydrogen peroxide [GO:0070301]; detection of chemical stimulus involved in sensory perception of pain [GO:0050968]; detection of mechanical stimulus involved in sensory perception [GO:0050974]; detection of mechanical stimulus involved in sensory perception of pain [GO:0050966]; intracellular calcium ion homeostasis [GO:0006874]; positive regulation of insulin secretion involved in cellular response to glucose stimulus [GO:0035774]; positive regulation of monoatomic anion transport [GO:1903793]; protein homotetramerization [GO:0051289]; regulation of blood circulation [GO:1903522]; regulation of neuronal action potential [GO:0098908]; response to cold [GO:0009409]; response to hydrogen peroxide [GO:0042542]; response to organic cyclic compound [GO:0014070]; response to pain [GO:0048265]; response to xenobiotic stimulus [GO:0009410]; sensory perception of pain [GO:0019233]; thermoception [GO:0050955]; urinary bladder smooth muscle contraction [GO:0014832]
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      detection of mechanical stimulus involved in sensory perception of sound [GO:0050910]; pexophagy [GO:0000425]; programmed cell death in response to reactive oxygen species [GO:0097468]; regulation of peroxisome organization [GO:1900063]; response to reactive oxygen species [GO:0000302]; sensory perception of sound [GO:0007605]; stereocilium maintenance [GO:0120045]
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             hair cycle process [GO:0022405]; Notch signaling pathway [GO:0007219]; regulation of Notch signaling pathway [GO:0008593]; sensory perception of sound [GO:0007605]; signal transduction [GO:0007165]; tooth mineralization [GO:0034505]
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             hair cycle process [GO:0022405]; Notch signaling pathway [GO:0007219]; regulation of Notch signaling pathway [GO:0008593]; sensory perception of sound [GO:0007605]; signal transduction [GO:0007165]; tooth mineralization [GO:0034505]
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        calcium ion transmembrane transport [GO:0070588]; cell surface receptor signaling pathway [GO:0007166]; cellular response to hydrogen peroxide [GO:0070301]; detection of chemical stimulus involved in sensory perception of pain [GO:0050968]; detection of mechanical stimulus involved in sensory perception of pain [GO:0050966]; intracellular calcium ion homeostasis [GO:0006874]; monoatomic ion transport [GO:0006811]; protein homotetramerization [GO:0051289]; response to cold [GO:0009409]; response to organic cyclic compound [GO:0014070]; response to pain [GO:0048265]; response to xenobiotic stimulus [GO:0009410]; sensory perception of pain [GO:0019233]; thermoception [GO:0050955]
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             sensory perception of sound [GO:0007605]
## 7                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             sensory perception of sound [GO:0007605]
## 8                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  regulation of fat cell differentiation [GO:0045598]
## 9                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        adult locomotory behavior [GO:0008344]; auditory receptor cell stereocilium organization [GO:0060088]; calcium ion transport [GO:0006816]; cell-cell adhesion [GO:0098609]; cochlea development [GO:0090102]; equilibrioception [GO:0050957]; homophilic cell adhesion via plasma membrane adhesion molecules [GO:0007156]; inner ear auditory receptor cell differentiation [GO:0042491]; inner ear development [GO:0048839]; inner ear morphogenesis [GO:0042472]; inner ear receptor cell stereocilium organization [GO:0060122]; locomotory behavior [GO:0007626]; photoreceptor cell maintenance [GO:0045494]; regulation of cytosolic calcium ion concentration [GO:0051480]; righting reflex [GO:0060013]; sensory perception of light stimulus [GO:0050953]; sensory perception of sound [GO:0007605]
## 10                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            hair cycle process [GO:0022405]; Notch signaling pathway [GO:0007219]; regulation of Notch signaling pathway [GO:0008593]; sensory perception of sound [GO:0007605]; signal transduction [GO:0007165]; tooth mineralization [GO:0034505]
## 11                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            hair cycle process [GO:0022405]; Notch signaling pathway [GO:0007219]; regulation of Notch signaling pathway [GO:0008593]; sensory perception of sound [GO:0007605]; signal transduction [GO:0007165]; tooth mineralization [GO:0034505]
## 12                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            hair cycle process [GO:0022405]; Notch signaling pathway [GO:0007219]; regulation of Notch signaling pathway [GO:0008593]; sensory perception of sound [GO:0007605]; signal transduction [GO:0007165]; tooth mineralization [GO:0034505]
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                       GeneOntologyIDs
## 1  GO:0005216; GO:0005245; GO:0005262; GO:0005886; GO:0006816; GO:0006874; GO:0007166; GO:0009409; GO:0009410; GO:0014070; GO:0014832; GO:0015267; GO:0015278; GO:0016324; GO:0019233; GO:0030424; GO:0032421; GO:0034605; GO:0035774; GO:0042542; GO:0042802; GO:0046872; GO:0048265; GO:0050955; GO:0050966; GO:0050968; GO:0050974; GO:0051289; GO:0070301; GO:0070417; GO:0070588; GO:0071244; GO:0071313; GO:0097553; GO:0097604; GO:0098908; GO:1903522; GO:1903793; GO:1990760
## 2                                                                                                                                                                                                                                                                                                                          GO:0000302; GO:0000425; GO:0005737; GO:0005778; GO:0007605; GO:0030864; GO:0035253; GO:0043025; GO:0050910; GO:0097468; GO:0120044; GO:0120045; GO:1900063
## 3                                                                                                                                                                                                                                                                                                                                                              GO:0005576; GO:0007165; GO:0007219; GO:0007605; GO:0008593; GO:0009986; GO:0022405; GO:0032420; GO:0034505; GO:0060170
## 4                                                                                                                                                                                                                                                                                                                                                              GO:0005576; GO:0007165; GO:0007219; GO:0007605; GO:0008593; GO:0009986; GO:0022405; GO:0032420; GO:0034505; GO:0060170
## 5                                                                                                                                                                                                                          GO:0005262; GO:0005886; GO:0006811; GO:0006874; GO:0007166; GO:0009409; GO:0009410; GO:0014070; GO:0015267; GO:0015278; GO:0019233; GO:0032421; GO:0042802; GO:0048265; GO:0050955; GO:0050966; GO:0050968; GO:0051289; GO:0070301; GO:0070588; GO:0097604
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                      GO:0007605; GO:0016020; GO:0032426; GO:0120234
## 7                                                                                                                                                                                                                                                                                                                                                                                                                                                              GO:0007605; GO:0032420
## 8                                                                                                                                                                                                                                                                                                                                                                                                              GO:0004725; GO:0009925; GO:0016324; GO:0043235; GO:0045598; GO:0120044
## 9                                                                                                                          GO:0001917; GO:0005509; GO:0005813; GO:0005886; GO:0005911; GO:0006816; GO:0007156; GO:0007605; GO:0007626; GO:0008344; GO:0032420; GO:0032426; GO:0042472; GO:0042491; GO:0045177; GO:0045202; GO:0045494; GO:0048839; GO:0050953; GO:0050957; GO:0051480; GO:0060013; GO:0060088; GO:0060091; GO:0060122; GO:0090102; GO:0098609; GO:0098683; GO:0098684
## 10                                                                                                                                                                                                                                                                                                                                                             GO:0005576; GO:0007165; GO:0007219; GO:0007605; GO:0008593; GO:0009986; GO:0022405; GO:0032420; GO:0034505; GO:0060170
## 11                                                                                                                                                                                                                                                                                                                                                             GO:0005576; GO:0007165; GO:0007219; GO:0007605; GO:0008593; GO:0009986; GO:0022405; GO:0032420; GO:0034505; GO:0060170
## 12                                                                                                                                                                                                                                                                                                                                                             GO:0005576; GO:0007165; GO:0007219; GO:0007605; GO:0008593; GO:0009986; GO:0022405; GO:0032420; GO:0034505; GO:0060170
##                                                                                                                                                                                                                                                                                                                                                         CellularComponent
## 1                                                                                                                                                                                                                                                  apical plasma membrane [GO:0016324]; axon [GO:0030424]; plasma membrane [GO:0005886]; stereocilium bundle [GO:0032421]
## 2                                                                                                                                                                      ciliary rootlet [GO:0035253]; cortical actin cytoskeleton [GO:0030864]; cytoplasm [GO:0005737]; neuronal cell body [GO:0043025]; peroxisomal membrane [GO:0005778]; stereocilium base [GO:0120044]
## 3                                                                                                                                                                                                                                                  cell surface [GO:0009986]; ciliary membrane [GO:0060170]; extracellular region [GO:0005576]; stereocilium [GO:0032420]
## 4                                                                                                                                                                                                                                                  cell surface [GO:0009986]; ciliary membrane [GO:0060170]; extracellular region [GO:0005576]; stereocilium [GO:0032420]
## 5                                                                                                                                                                                                                                                                                                          plasma membrane [GO:0005886]; stereocilium bundle [GO:0032421]
## 6                                                                                                                                                                                                                                                                                    membrane [GO:0016020]; stereocilium coat [GO:0120234]; stereocilium tip [GO:0032426]
## 7                                                                                                                                                                                                                                                                                                                                               stereocilium [GO:0032420]
## 8                                                                                                                                                                                                                                  apical plasma membrane [GO:0016324]; basal plasma membrane [GO:0009925]; receptor complex [GO:0043235]; stereocilium base [GO:0120044]
## 9  apical part of cell [GO:0045177]; cell-cell junction [GO:0005911]; centrosome [GO:0005813]; cochlear hair cell ribbon synapse [GO:0098683]; kinocilium [GO:0060091]; photoreceptor inner segment [GO:0001917]; photoreceptor ribbon synapse [GO:0098684]; plasma membrane [GO:0005886]; stereocilium [GO:0032420]; stereocilium tip [GO:0032426]; synapse [GO:0045202]
## 10                                                                                                                                                                                                                                                 cell surface [GO:0009986]; ciliary membrane [GO:0060170]; extracellular region [GO:0005576]; stereocilium [GO:0032420]
## 11                                                                                                                                                                                                                                                 cell surface [GO:0009986]; ciliary membrane [GO:0060170]; extracellular region [GO:0005576]; stereocilium [GO:0032420]
## 12                                                                                                                                                                                                                                                 cell surface [GO:0009986]; ciliary membrane [GO:0060170]; extracellular region [GO:0005576]; stereocilium [GO:0032420]
##                                                                                                                                                                                                                                                                                                                                                                                                                           MolecularFunction
## 1  calcium channel activity [GO:0005262]; channel activity [GO:0015267]; identical protein binding [GO:0042802]; intracellularly gated calcium channel activity [GO:0015278]; metal ion binding [GO:0046872]; monoatomic ion channel activity [GO:0005216]; osmolarity-sensing monoatomic cation channel activity [GO:1990760]; temperature-gated cation channel activity [GO:0097604]; voltage-gated calcium channel activity [GO:0005245]
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                          
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                          
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                          
## 5                                                                                                                                                                                                         calcium channel activity [GO:0005262]; channel activity [GO:0015267]; identical protein binding [GO:0042802]; intracellularly gated calcium channel activity [GO:0015278]; temperature-gated cation channel activity [GO:0097604]
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                          
## 7                                                                                                                                                                                                                                                                                                                                                                                                                                          
## 8                                                                                                                                                                                                                                                                                                                                                                                        protein tyrosine phosphatase activity [GO:0004725]
## 9                                                                                                                                                                                                                                                                                                                                                                                                          calcium ion binding [GO:0005509]
## 10                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 11                                                                                                                                                                                                                                                                                                                                                                                                                                         
## 12                                                                                                                                                                                                                                                                                                                                                                                                                                         
##                                                                          short_name
## 1  Transient receptor potential cation channel subfamily A member 1 (Ankyrin-lik...
## 2                           Pejvakin (Autosomal recessive deafness type 59 protein)
## 3  Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 4  Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 5  Transient receptor potential cation channel subfamily A member 1 (Ankyrin-lik...
## 6  Fibrocystin-L (Polycystic kidney and hepatic disease 1-like protein 1) (PKHD1...
## 7                                 Lipoxygenase homology domain-containing protein 1
## 8  Phosphatidylinositol phosphatase PTPRQ (EC 3.1.3.-) (Receptor-type tyrosine-p...
## 9                                                         Cadherin-23 (Otocadherin)
## 10 Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 11 Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
## 12 Thrombospondin-type laminin G domain and EAR repeat-containing protein (TSP-EAR)
##                          short_GO
## 1  calcium ion transmembrane i...
## 2  detection of mechanical sti...
## 3  hair cycle process [GO:0022...
## 4  hair cycle process [GO:0022...
## 5  calcium ion transmembrane t...
## 6  sensory perception of sound...
## 7  sensory perception of sound...
## 8  regulation of fat cell diff...
## 9  adult locomotory behavior [...
## 10 hair cycle process [GO:0022...
## 11 hair cycle process [GO:0022...
## 12 hair cycle process [GO:0022...

Zona pellucida

Amil_ZP <- DESeq_SwissProt %>% filter(blast_hit=="G8HTB6") %>% pull(query) %>% unique()

PFAM_ZP_domain <- annot_all   %>% filter(grepl("Zona_pellucida",Short.name,ignore.case=TRUE))  %>% 
  #filter(!(query %in% Amil_ZP)) %>% 
  pull(query) %>% unique()

Heat Stress Genes

differentially expressed

select <- DE_05_HeatStressGenes %>% arrange(response_type) %>% select(query,gene_name,response_type) %>% unique()
rownames(select) <- select$query

z_scores <- t(scale(t(assay(vsd)[select$query, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_row = (select %>% select(response_type)), annotation_colors = ann_colors,
         labels_row = select$gene_name, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "HeatStress_DE_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select$query,
  Heatmap_Label = select$gene_name,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  annotation_row = (select %>% select(response_type)),
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 4  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/HeatStress_DE_SwissProt")

all

select <- DESeq_HeatStressGenes %>% arrange(response_type) %>% select(query,gene_name,response_type) %>%
    group_by(query) %>% #collapse duplicate rows
  summarise(
    gene_name = paste(unique(gene_name), collapse = "; "),
    response_type = paste(unique(response_type), collapse = "; "),
    .groups = "drop"
  ) %>% arrange(response_type,gene_name)
select <- data.frame(select)
rownames(select) <- select$query

z_scores <- t(scale(t(assay(vsd)[select$query, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_row = (select %>% select(response_type)), annotation_colors = ann_colors,
         labels_row = select$gene_name, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "HeatStress_all_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select$query,
  Heatmap_Label = select$gene_name,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  annotation_row = (select %>% select(response_type)),
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 4  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/HeatStress_all_SwissProt")

wnt and hox

WNT <- Manual %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE)|grepl("frizzle", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE))#|grepl("wnt", BiologicalProcess, ignore.case = TRUE))

select <- WNT$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "WNT_Hox_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  gaps_row = 26  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/WNT_Hox_DE_SwissProt")

wnt chromosome

WNT_all <- Manual %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE)|grepl("frizzle", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE))#|grepl("wnt", BiologicalProcess, ignore.case = TRUE))

WNT <- Manual %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE))
FRZ <- Manual %>% filter(grepl("frizzle", Heatmap_Label, ignore.case = TRUE))
HOX <- Manual %>% filter(grepl("hox", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE))

select <- WNT$query

gene_layout <- gff_transcripts %>%
  filter(query %in% select) %>%
  arrange(chromosome, start) %>% left_join(WNT) %>% distinct() 

expr_df <- assay(vsd)[select, ] %>%
  as.data.frame() %>%
  rownames_to_column("query") %>%
  pivot_longer(-query, names_to = "Sample", values_to = "vsd_expr") %>%
  left_join(as.data.frame(colData(vsd)), by = "Sample") %>%
  group_by(query, Tissue) %>%
  summarise(mean_expr = mean(vsd_expr), .groups = "drop") %>%
  pivot_wider(names_from = Tissue, values_from = mean_expr)

plot_df <- gene_layout %>%
  left_join(expr_df, by = "query")

plot_df <- plot_df %>%
  mutate(highest_expr_tissue = case_when(
    OralEpi > Aboral ~ "OralEpi",
    Aboral > OralEpi ~ "Aboral",
    TRUE ~ "equal"
  ))

plot_df <- plot_df %>%
  mutate(chromosome_split = case_when(
    chromosome == "Pocillopora_acuta_HIv2___xfSc0000014" & start < 3.6e6 ~ "xfSc0000014_A",
    chromosome == "Pocillopora_acuta_HIv2___xfSc0000014" & start >= 3.65e6 ~ "xfSc0000014_B",
    TRUE ~ chromosome
  ))

plot_df$Heatmap_Label <- gsub("Protein ","",plot_df$Heatmap_Label)

heat_df <- plot_df %>%
  select(query, start, end, width,chromosome, OralEpi, Aboral) %>%
  pivot_longer(cols = c(OralEpi, Aboral), names_to = "tissue", values_to = "expression") %>%
  mutate(midpoint = (start + end) / 2)  # for positioning tile center

heat_df <- heat_df %>%
  left_join(plot_df %>% select(query, chromosome_split), by = "query") %>%
  mutate(chromosome = chromosome_split)

library(gggenes)
library(ggnewscale)

Wnt_chr <- ggplot(plot_df, aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_label(data = plot_df,
           aes(x = (start + end)/2, y = "_Genes_ ", label = Heatmap_Label),
           vjust = -1.75, size = 2, fill = "lightgrey", label.size = 0.1,fontface="bold")+
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome_split, scales = "free_x")+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(2, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")
Wnt_chr

save_ggplot(Wnt_chr, "Wnt_chromosome")

Wnt_chr_heatmap <- ggplot() +
  # Heatmap tiles
  geom_tile(data = heat_df,
            aes(x = midpoint, y = tissue, fill = expression,width = width),colour="black",
            height = 0.5) +
  
  # Color for heatmap (expression level)
  scale_fill_gradient(low = "white", high = "firebrick", name = "Expression") +
  new_scale_fill() +
  
  # Gene arrows
  geom_gene_arrow(data = plot_df,
                  aes(xmin = start, xmax = end, y = "_Genes_",
                      forward = strand == "+", fill = highest_expr_tissue),
                  arrowhead_height = unit(3, "mm"),
                  arrowhead_width = unit(1.5, "mm")) +
    geom_label(data = plot_df,
           aes(x = (start + end)/2, y = "_Genes_", label = Heatmap_Label),
           vjust = 1.5, size = 4, fill = "lightgrey", label.size = 0.1,fontface="bold")+

  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey"),
                    name = "Higher expression in") +

  facet_wrap(~ chromosome_split, scales = "free_x") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.title.y = element_blank(),
    panel.spacing = unit(.1, "lines")
  ) +scale_y_discrete(expand = expansion(mult = c(1, 0)))+
  labs(x = "Genomic position")

save_ggplot(Wnt_chr_heatmap, "Wnt_chromosome_expression",width = 20, height = 5)

# Shift gene coordinates within each chromosome
plot_df_shifted <- plot_df %>%
  group_by(chromosome) %>%
  mutate(
    chr_start = min(start),
    start_shifted = start - chr_start + 10,  # +10 if you want to avoid 0
    end_shifted   = end - chr_start + 10
  ) %>%
  ungroup()


heat_df_shifted <- plot_df_shifted %>%
  select(query, start_shifted, end_shifted, width,chromosome, OralEpi, Aboral) %>%
  pivot_longer(cols = c(OralEpi, Aboral), names_to = "tissue", values_to = "expression") %>%
  mutate(midpoint = (start_shifted + end_shifted) / 2)  # for positioning tile center

Wnt_chr_heatmap <- ggplot() +
  # Heatmap tiles
  geom_tile(data = heat_df_shifted,
            aes(x = midpoint, y = tissue, fill = expression,width = width),colour="black",
            height = 0.5) +
  
  # Color for heatmap (expression level)
  scale_fill_gradient(low = "white", high = "firebrick", name = "Expression") +
  new_scale_fill() +
  
  # Gene arrows
  geom_gene_arrow(data = plot_df_shifted,
                  aes(xmin = start_shifted, xmax = end_shifted, y = "_Genes_",
                      forward = strand == "+", fill = highest_expr_tissue),
                  arrowhead_height = unit(3, "mm"),
                  arrowhead_width = unit(1.5, "mm")) +
    geom_label(data = plot_df_shifted,
           aes(x = (start_shifted + end_shifted)/2, y = "_Genes_", label = Heatmap_Label),
           vjust = 1.5, size = 4, fill = "lightgrey", label.size = 0.1,fontface="bold")+

  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey"),
                    name = "Higher expression in") +

  facet_wrap(~ chromosome, scales = "fixed",ncol=1) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.title.y = element_blank(),
    panel.spacing = unit(.1, "lines")
  ) +scale_y_discrete(expand = expansion(mult = c(1, 0)))+
  labs(x = "Genomic position")

save_ggplot(Wnt_chr_heatmap, "Wnt_chromosome_expression_length_preserved",width = 30, height = 5)

Hox

HOX <- Manual %>% filter(grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE))
select <- HOX$query

gene_layout <- gff_transcripts %>%
  filter(query %in% select) %>%
  arrange(chromosome, start) %>% left_join(HOX) %>% distinct() 

expr_df <- assay(vsd)[select, ] %>%
  as.data.frame() %>%
  rownames_to_column("query") %>%
  pivot_longer(-query, names_to = "Sample", values_to = "vsd_expr") %>%
  left_join(as.data.frame(colData(vsd)), by = "Sample") %>%
  group_by(query, Tissue) %>%
  summarise(mean_expr = mean(vsd_expr), .groups = "drop") %>%
  pivot_wider(names_from = Tissue, values_from = mean_expr)

plot_df <- gene_layout %>%
  left_join(expr_df, by = "query")

plot_df <- plot_df %>%
  mutate(highest_expr_tissue = case_when(
    OralEpi > Aboral ~ "OralEpi",
    Aboral > OralEpi ~ "Aboral",
    TRUE ~ "equal"
  ))

plot_df <- plot_df %>%
  mutate(chromosome_split = case_when(
    chromosome == "Pocillopora_acuta_HIv2___xfSc0000002" & start < 2.2e6 ~ "xfSc0000002_A",
    chromosome == "Pocillopora_acuta_HIv2___xfSc0000002" & start >= 6.4e6 ~ "xfSc0000002_B",
      chromosome == "Pocillopora_acuta_HIv2___Sc0000011" & start < 2e6 ~ "Sc0000011_A",
      chromosome == "Pocillopora_acuta_HIv2___Sc0000011" & start >= 2e6 ~ "Sc0000011_B",
    chromosome == "Pocillopora_acuta_HIv2___Sc0000024" & start < 420000 ~ "Sc0000024_A",
    chromosome == "Pocillopora_acuta_HIv2___Sc0000024" & start >= 2300000 ~ "Sc0000024_B",
          chromosome == "Pocillopora_acuta_HIv2___xfSc0000008" & start < 2e6 ~ "xfSc0000008_A",
      chromosome == "Pocillopora_acuta_HIv2___xfSc0000008" & start >= 2e6 ~ "xfSc0000008_B",
    TRUE ~ chromosome
  ))

heat_df <- plot_df %>%
  select(query, start, end, width,chromosome, OralEpi, Aboral) %>%
  pivot_longer(cols = c(OralEpi, Aboral), names_to = "tissue", values_to = "expression") %>%
  mutate(midpoint = (start + end) / 2)  # for positioning tile center

heat_df <- heat_df %>%
  left_join(plot_df %>% select(query, chromosome_split), by = "query") %>%
  mutate(chromosome = chromosome_split)

library(gggenes)
library(ggnewscale)

HOX_chr <- ggplot(plot_df[1:6,], aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_label(aes(x = (start + end)/2, y = "_Genes_ ", label = Heatmap_Label),
           vjust = -1.75, size = 2, fill = "lightgrey", label.size = 0.1,fontface="bold")+
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome_split, scales = "free_x",ncol=1)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(2, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")
HOX_chr

 save_ggplot(HOX_chr, "Hox_chromosome")

 HOX_chr <- ggplot(plot_df[7:12,], aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_label(aes(x = (start + end)/2, y = "_Genes_ ", label = Heatmap_Label),
           vjust = -1.75, size = 2, fill = "lightgrey", label.size = 0.1,fontface="bold")+
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome_split, scales = "free_x",ncol=1)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(2, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")
HOX_chr

 save_ggplot(HOX_chr, "Hox_chromosome2")

  HOX_chr <- ggplot(plot_df[13:19,], aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_label(aes(x = (start + end)/2, y = "_Genes_ ", label = Heatmap_Label),
           vjust = -1.75, size = 2, fill = "lightgrey", label.size = 0.1,fontface="bold")+
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome_split, scales = "free_x",ncol=1)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(2, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")
HOX_chr

 save_ggplot(HOX_chr, "Hox_chromosome3")

   HOX_chr <- ggplot(plot_df[20:24,], aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_label(aes(x = (start + end)/2, y = "_Genes_ ", label = Heatmap_Label),
           vjust = -1.75, size = 2, fill = "lightgrey", label.size = 0.1,fontface="bold")+
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome_split, scales = "free_x",ncol=1)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(2, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")
HOX_chr

 save_ggplot(HOX_chr, "Hox_chromosome4")

HOX_chr_heatmap <- ggplot() +
  # Heatmap tiles
  geom_tile(data = heat_df,
            aes(x = midpoint, y = tissue, fill = expression,width = width),colour="black",
            height = 0.5) +
  
  # Color for heatmap (expression level)
  scale_fill_gradient(low = "white", high = "firebrick", name = "Expression") +
  new_scale_fill() +
  
  # Gene arrows
  geom_gene_arrow(data = plot_df,
                  aes(xmin = start, xmax = end, y = "_Genes_",
                      forward = strand == "+", fill = highest_expr_tissue),
                  arrowhead_height = unit(3, "mm"),
                  arrowhead_width = unit(1.5, "mm")) +
    geom_label(data = plot_df,
           aes(x = (start + end)/2, y = "_Genes_", label = Heatmap_Label),
           vjust = 1.5, size = 2 , fill = "lightgrey", label.size = 0.1,fontface="bold")+

  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey"),
                    name = "Higher expression in") +

  facet_wrap(~ chromosome_split, scales = "free_x") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 5),
    axis.title.y = element_blank(),
    panel.spacing = unit(.1, "lines")
  ) +#scale_y_discrete(expand = expansion(mult = c(1, 0)))+
  labs(x = "Genomic position")

save_ggplot(HOX_chr_heatmap, "Hox_chromosome_expression",width = 20, height = 5)

# Shift gene coordinates within each chromosome
plot_df_shifted <- plot_df %>%
  group_by(chromosome_split) %>%
  mutate(
    chr_start = min(start),
    start_shifted = start - chr_start + 10,  # +10 if you want to avoid 0
    end_shifted   = end - chr_start + 10
  ) %>%
  ungroup()


heat_df_shifted <- plot_df_shifted %>%
  select(query, start_shifted, end_shifted, width,chromosome_split, OralEpi, Aboral) %>%
  pivot_longer(cols = c(OralEpi, Aboral), names_to = "tissue", values_to = "expression") %>%
  mutate(midpoint = (start_shifted + end_shifted) / 2)  # for positioning tile center

HOX_chr_heatmap <- ggplot() +
  # Heatmap tiles
  geom_tile(data = heat_df_shifted,
            aes(x = midpoint, y = tissue, fill = expression,width = width),colour="black",
            height = 1) +
  
  # Color for heatmap (expression level)
  scale_fill_gradient(low = "white", high = "firebrick", name = "Expression") +
  new_scale_fill() +
  
  # Gene arrows
  geom_gene_arrow(data = plot_df_shifted,
                  aes(xmin = start_shifted, xmax = end_shifted, y = "_Genes_",
                      forward = strand == "+", fill = highest_expr_tissue),
                  arrowhead_height = unit(3, "mm"),
                  arrowhead_width = unit(1.5, "mm")) +
    geom_label(data = plot_df_shifted,
           aes(x = (start_shifted + end_shifted)/2, y = "_Genes_", label = Heatmap_Label),
           vjust = 1.5, size = 4, fill = "lightgrey", label.size = 0.1,fontface="bold")+

  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey"),
                    name = "Higher expression in") +

  facet_wrap(~ chromosome_split, scales = "fixed",ncol=1) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.title.y = element_blank(),
    panel.spacing = unit(.1, "lines")
  ) +scale_y_discrete(expand = expansion(mult = c(1, 0)))+
  labs(x = "Genomic position")

save_ggplot_big <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300) {
  png_path <- file.path(outdir, paste0(filename, ".png"))
  pdf_dir <- file.path(outdir, "pdf_figs")
  pdf_path <- file.path(pdf_dir, paste0(filename, ".pdf"))
  
  # Ensure the pdf_figs directory exists
  if (!dir.exists(pdf_dir)) dir.create(pdf_dir, recursive = TRUE)
  
  # Save plots
  ggsave(filename = png_path, plot = plot, width = width, height = height, units = units, dpi = dpi,limitsize = FALSE)
  ggsave(filename = pdf_path, plot = plot, width = width, height = height, units = units, dpi = dpi,limitsize = FALSE)
}

save_ggplot_big(HOX_chr_heatmap, "Hox_chromosome_expression_length_preserved",width = 15, height = 25)

He et al 2023 TFs/genes of interest Nematostella (NOTE! I confirmed the DE_05 ones are all represented above )

differentially expressed

select <- unique(DE_05_He_etal$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "He_etal_Nvec_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 4  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/He_etal_Nvec_DE_SwissProt")

#### all

labels <- DESeq_He_etal$def_short
select <- DESeq_He_etal$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = labels, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "He_etal_Nvec_all_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = labels,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  
  gaps_row = 9  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/He_etal_Nvec_all_SwissProt")

gene_layout <- gff_transcripts %>%
  filter(query %in% select) %>%
  arrange(chromosome, start) %>% left_join(DESeq_He_etal) %>% distinct() 

expr_df <- assay(vsd)[select, ] %>%
  as.data.frame() %>%
  rownames_to_column("query") %>%
  pivot_longer(-query, names_to = "Sample", values_to = "vsd_expr") %>%
  left_join(as.data.frame(colData(vsd)), by = "Sample") %>%
  group_by(query, Tissue) %>%
  summarise(mean_expr = mean(vsd_expr), .groups = "drop") %>%
  pivot_wider(names_from = Tissue, values_from = mean_expr)

plot_df <- gene_layout %>%
  left_join(expr_df, by = "query")

plot_df <- plot_df %>%
  mutate(highest_expr_tissue = case_when(
    OralEpi > Aboral ~ "OralEpi",
    Aboral > OralEpi ~ "Aboral",
    TRUE ~ "equal"
  ))

heat_df <- plot_df %>%
  select(query, start, end, width,chromosome, OralEpi, Aboral) %>%
  pivot_longer(cols = c(OralEpi, Aboral), names_to = "tissue", values_to = "expression") %>%
  mutate(midpoint = (start + end) / 2)  # for positioning tile center


library(gggenes)
library(ggnewscale)

Hox_chr <- ggplot(plot_df, aes(xmin = start, xmax = end, y = "Hox genes", forward = strand == "+",
                    fill = highest_expr_tissue)) +
  geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1.5, "mm")) +
  geom_gene_label(aes(label = Gene_Name), size = 50) +
  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey")) +
  theme_minimal() +
  facet_wrap(~ chromosome, scales = "free_x", ncol = 1)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.spacing = unit(.1, "lines")) +
  labs(x = "Genomic position", fill = "Higher expression in")


Hox_chr_heatmap <- ggplot() +
  # Heatmap tiles
  geom_tile(data = heat_df,
            aes(x = midpoint, y = tissue, fill = expression),colour="black",
            height = 0.4) +
  
  # Color for heatmap (expression level)
  scale_fill_gradient(low = "white", high = "firebrick", name = "Expression") +
  new_scale_fill() +
  
  # Gene arrows
  geom_gene_arrow(data = plot_df,
                  aes(xmin = start, xmax = end, y = "_Genes_",
                      forward = strand == "+", fill = highest_expr_tissue),
                  arrowhead_height = unit(3, "mm"),
                  arrowhead_width = unit(1.5, "mm")) +

  geom_gene_label(data = plot_df,
                  aes(xmin = start, xmax = end, y = "_Genes_", label = Gene_Name),
                  size = 50) +

  scale_fill_manual(values = c("OralEpi" = "palegreen3", "Aboral" = "mediumpurple1", "equal" = "grey"),
                    name = "Higher expression in") +
  facet_wrap(~ chromosome, scales = "free_x", ncol = 1) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10),
    axis.title.y = element_blank(),
    panel.spacing = unit(.1, "lines")
  ) +
  labs(x = "Genomic position")

save_ggplot_big(Hox_chr, "He_etal_Nvec_all_SwissProt_chromosome", width = 20, height = 40)
save_ggplot_big(Hox_chr_heatmap, "He_etal_Nvec_all_SwissProt_chromosome_expression", width = 20, height = 40)

DuBuc et al 2018 Wnt/Aboral-Oral Patterning Genes, interest Nematostella

differentially expressed

select <- unique(DE_05_DuBuc_etal$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "DuBuc_etal_Nvec_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white"
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/DuBuc_etal_Nvec_DE_SwissProt")

combine hox and WNT lists

DESeq_He_etal$source <- "He_etal_2023"
DESeq_DuBuc_etal$source <- "DuBuc_etal_2018"

Hox_Literature <- rbind(DESeq_He_etal,DESeq_DuBuc_etal)

DESeq_Hox_Literature_collapsed <- Hox_Literature %>%
  group_by(query, baseMean, log2FoldChange, lfcSE, pvalue, padj) %>%
  summarize(
    Gene_Name = paste(unique(Gene_Name), collapse = "; "),
    Description = paste(unique(Description), collapse = "; "),
    def_short = paste(unique(def_short), collapse = "; "),
    source = paste(unique(source), collapse = "; "),
    .groups = "drop"
  )

DE_05_Hox_Literature_collapsed <- DESeq_Hox_Literature_collapsed %>% filter(padj<0.05) %>% filter(abs(log2FoldChange) >1)

WNT_HOX_MAN <- Manual %>% select(-X) %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE)|grepl("frizzle", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE)) %>% select(-c(blast_hit,evalue,ProteinNames,BiologicalProcess,GeneOntologyIDs))# %>% rename(Description = Heatmap_Label)

COMBINED_HOX_WNT_all <-  full_join(DESeq_Hox_Literature_collapsed,WNT_HOX_MAN,by=c("query")) %>%
  mutate(
    baseMean = coalesce(baseMean.x, baseMean.y),
    log2FoldChange = coalesce(log2FoldChange.x, log2FoldChange.y),
    lfcSE = coalesce(lfcSE.x, lfcSE.y),
    pvalue = coalesce(pvalue.x, pvalue.y),
    padj = coalesce(padj.x, padj.y)
  ) %>%
  select(-ends_with(".x"), -ends_with(".y"))

COMBINED_HOX_WNT_DE05 <-  full_join(DE_05_Hox_Literature_collapsed,WNT_HOX_MAN,by=c("query")) %>%
  mutate(
    baseMean = coalesce(baseMean.x, baseMean.y),
    log2FoldChange = coalesce(log2FoldChange.x, log2FoldChange.y),
    lfcSE = coalesce(lfcSE.x, lfcSE.y),
    pvalue = coalesce(pvalue.x, pvalue.y),
    padj = coalesce(padj.x, padj.y)
  ) %>%
  select(-ends_with(".x"), -ends_with(".y"))
select <- unique(COMBINED_HOX_WNT_DE05$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "Hox_All_DE")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(2,5,12,14,19,25) 
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/Hox_All_DE")

COMBINED_HOX_WNT_all <- COMBINED_HOX_WNT_all %>%  mutate(Heatmap_Label = coalesce(Heatmap_Label, def_short))
COMBINED_HOX_WNT_all$Heatmap_Label <- gsub("Protein Wnt", "Wnt", COMBINED_HOX_WNT_all$Heatmap_Label, ignore.case = TRUE)

select <- unique(COMBINED_HOX_WNT_all$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(heatmap_metadata%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = COMBINED_HOX_WNT_all$Heatmap_Label, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "Hox_All")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = COMBINED_HOX_WNT_all$Heatmap_Label,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (heatmap_metadata %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  
  gaps_row = c(2,5,16,18,19,24,33) 
)

# Save the heatmap
save_ggplot(top50_DE, "clusters_clean/Hox_All")

random

# Function to generate short names for proteins
generate_short_name <- function(data) {
  data %>%
    mutate(short_name = ifelse(nchar(ProteinNames) > 60, 
                               paste0(substr(ProteinNames, 1, 57), "..."), 
                               ProteinNames))
}

# Function to create gene labels
create_gene_labels <- function(data) {
  data %>%
    select(query, short_name) %>%
    mutate_all(~ ifelse(is.na(.), "", .)) # Replace NAs with ""
}

# Function to generate z-scores for selected genes
calculate_z_scores <- function(data, selection, vsd_matrix) {
  selected_genes <- match(selection, rownames(vsd_matrix))
  selected_genes <- selected_genes[!is.na(selected_genes)] # Remove NAs
  t(scale(t(assay(vsd_matrix)[selected_genes, ]), center = TRUE, scale = TRUE))
}

# Function to create and save heatmap
create_heatmap <- function(z_scores, labels_row, annotation_col, annotation_colors, filename) {
  heatmap <- pheatmap(z_scores, 
                      color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
                      cluster_rows = TRUE, 
                      show_rownames = TRUE,
                      cluster_cols = TRUE, 
                      cutree_cols = 2,
                      annotation_col = annotation_col,
                      annotation_colors = annotation_colors,
                      labels_row = labels_row, 
                      fontsize_row = 6)
  save_ggplot(heatmap, filename)
}

# Process DE_05_SwissProt
DE_05_SwissProt <- generate_short_name(DE_05_SwissProt)
gene_labels <- create_gene_labels(DE_05_SwissProt)
GFP <- DE_05_SwissProt %>% filter(grepl("biolum", BiologicalProcess, ignore.case = TRUE))
select <- GFP$query
z_scores <- calculate_z_scores(DE_05_SwissProt, select, vsd)
create_heatmap(z_scores, 
               labels_row = gene_labels[match(select, gene_labels$query), 2], 
               annotation_col = heatmap_metadata %>% select(Tissue), 
               annotation_colors = ann_colors, 
               filename = "GFP_DE_SwissProt")

# Process annot_tab
annot_tab <- generate_short_name(annot_tab)
gene_labels <- create_gene_labels(annot_tab)
GFP <- annot_tab %>% filter(grepl("biolum", BiologicalProcess, ignore.case = TRUE))
select1 <- GFP$query
z_scores <- calculate_z_scores(annot_tab, select1, vsd)
create_heatmap(z_scores, 
               labels_row = gene_labels[match(select1, gene_labels$query), 2], 
               annotation_col = heatmap_metadata %>% select(Tissue), 
               annotation_colors = ann_colors, 
               filename = "GFP_SwissProt")

Aboral Oral Larvae Paper - Pdam genome

Join our data to Pdam annotations

Annot_Pdam <- read.csv("../references/annotation/blastp_Pdam_out.tab", sep = '\t', header = FALSE) %>% select(c(1,2)) %>% dplyr::rename("protein_id" = "V2", "query" = "V1")
Manual_Pdam <- left_join(Manual, Annot_Pdam) 
library(readxl)
larval_aboral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_aboral_enriched_05_")

Manual_Pdam_upaboral <- Manual_Pdam %>% filter(log2FoldChange > 0)
inner_join(larval_aboral_enriched, Manual_Pdam_upaboral, by=c("gene ID"="protein_id")) %>% dim()
## [1] 17 26
#18 genes overlap from the 126 in the paper

larval_oral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_oral_enriched_05_lf")

Manual_Pdam_uporal <- Manual_Pdam %>% filter(log2FoldChange < 0)
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% dim()
## [1]  6 26
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% print()
## # A tibble: 6 × 26
##   `gene ID`      baseMean.x log2FoldChange.x lfcSE.x  stat pvalue.x   padj.x
##   <chr>               <dbl>            <dbl>   <dbl> <dbl>    <dbl>    <dbl>
## 1 XP_027046532.1       88.9             4.29   0.652  5.04 4.54e- 7 8.03e- 5
## 2 XP_027036583.1      563.              3.04   0.287  7.12 1.06e-12 4.11e-10
## 3 XP_027054215.1      918.              2.57   0.275  5.71 1.14e- 8 2.77e- 6
## 4 XP_027054196.1      855.              2.49   0.263  5.65 1.59e- 8 3.76e- 6
## 5 XP_027058731.1     1372.              2.44   0.270  5.35 8.95e- 8 1.85e- 5
## 6 XP_027040620.1     1782.              1.83   0.241  3.45 5.66e- 4 4.98e- 2
## # ℹ 19 more variables: `pfam domain` <chr>, `signal peptide` <chr>,
## #   `astroides ortholog` <chr>, `astroides oral` <chr>,
## #   `clytia ortholog` <chr>, `clytia oral` <chr>, X <int>, query <chr>,
## #   baseMean.y <dbl>, log2FoldChange.y <dbl>, lfcSE.y <dbl>, pvalue.y <dbl>,
## #   padj.y <dbl>, Heatmap_Label <chr>, blast_hit <chr>, evalue <dbl>,
## #   ProteinNames <chr>, BiologicalProcess <chr>, GeneOntologyIDs <chr>
#only 6/83 genes overlap


larval_aboral_clusters <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval_scRNA.xlsx") %>% filter(!is.na(pocillopora))

larval_aboral_clusters_Pacuta <- inner_join(larval_aboral_clusters, Manual_Pdam, by=c("pocillopora"="protein_id"))

#my genes upregulated in oral tissue are high in mucous cells, cool

Updating Renv environment:

After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.

renv::snapshot()